ex4-NN back propagation

Andrew Ng machine learning exercise ex4: NN back propagation

Data for this exercise

Required imports:

import matplotlib.pyplot as plt
import numpy as np
import scipy.io as sio
import matplotlib
import scipy.optimize as opt
from sklearn.metrics import classification_report  # for the evaluation report

Visualizing the data

Load the data:

def load_data(path, transpose=True):
    data = sio.loadmat(path)
    y = data.get('y')
    y = y.reshape(y.shape[0])
    X = data.get('X')
    if transpose:
        # the raw images are stored column-major; transpose each 20x20 image
        X = np.array([im.reshape((20, 20)).T for im in X])
        X = np.array([im.reshape(400) for im in X])
    return X, y


X, y = load_data('./data/ex4data1.mat')


def plot_100_image(X):
    """Plot 100 randomly chosen images from X."""
    size = int(np.sqrt(X.shape[1]))
    sample_idx = np.random.choice(X.shape[0], 100)  # pick 100 random rows
    sample_images = X[sample_idx, :]

    fig, ax_array = plt.subplots(nrows=10, ncols=10, sharey=True, sharex=True, figsize=(8, 8))

    for r in range(10):
        for c in range(10):
            ax_array[r, c].matshow(sample_images[10 * r + c].reshape((size, size)),
                                   cmap=matplotlib.cm.binary)
            plt.xticks(np.array([]))
            plt.yticks(np.array([]))
    plt.show()


plot_100_image(X)

(figure: 100 randomly sampled digit images)

Preparing the data

Add a bias column of ones to the feature matrix X, and one-hot encode the target vector y.

X_raw, y_raw = load_data('./data/ex4data1.mat', transpose=False)  # no transpose this time
X = np.insert(X_raw, 0, np.ones(X_raw.shape[0]), axis=1)  # prepend a column of ones
print(y_raw.shape)  # (5000,)
y = np.array([y_raw]).T  # column vector, (5000, 1)
from sklearn.preprocessing import OneHotEncoder
encoder = OneHotEncoder(sparse=False)  # newer scikit-learn versions use sparse_output=False
y_onehot = encoder.fit_transform(y)
print(y_onehot.shape)  # (5000, 10)
y = y_onehot  # use the one-hot labels from here on
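As a quick sanity check of the encoding (the printed values assume the original MATLAB ordering of the data, in which the first samples are the digit 0 labeled as 10):

print(y_raw[0])      # 10  (digit 0 is labeled 10 in the MATLAB data)
print(y_onehot[0])   # [0. 0. 0. 0. 0. 0. 0. 0. 0. 1.]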

Loading the weights

First read Theta1 and Theta2 from ex4weights.mat, then unroll (flatten) the two matrices into a single parameter vector.

def load_weight(path):
    data = sio.loadmat(path)
    return data['Theta1'], data['Theta2']


t1, t2 = load_weight('./data/ex4weights.mat')
print(t1.shape, t2.shape)  # (25, 401) (10, 26)


def serialize(a, b):
    # np.ravel() flattens each matrix, np.concatenate() joins the flat vectors
    return np.concatenate((np.ravel(a), np.ravel(b)))


def deserialize(seq):
    # split the flat vector back into the two theta matrices
    return seq[:25 * 401].reshape(25, 401), seq[25 * 401:].reshape(10, 26)


theta = serialize(t1, t2)
print(theta.shape)  # (25 * 401) + (10 * 26) = 10285
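Unrolling is easy to get subtly wrong, so a round-trip check is cheap insurance (a minimal sketch using the matrices just loaded):

t1_back, t2_back = deserialize(serialize(t1, t2))
assert np.allclose(t1, t1_back) and np.allclose(t2, t2_back)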

Feed forward (forward propagation)

(400 + 1) -> (25 + 1) -> (10)
(figure: network architecture)
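In equations, with $g$ the sigmoid, the pass implemented below is:

$$a^{(1)} = x,\quad z^{(2)} = a^{(1)}\Theta_1^T,\quad a^{(2)} = [1;\ g(z^{(2)})],\quad z^{(3)} = a^{(2)}\Theta_2^T,\quad h_\theta(x) = g(z^{(3)})$$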

def sigmoid(z):
    return 1 / (1 + np.exp(-z))


def feed_forward(theta, X):
    t1, t2 = deserialize(theta)
    m = X.shape[0]

    a1 = X  # 5000 x 401
    z2 = a1 @ t1.T  # 5000 x 25
    a2 = np.insert(sigmoid(z2), 0, np.ones(m), axis=1)  # 5000 x 26, bias column prepended
    z3 = a2 @ t2.T  # 5000 x 10
    h = sigmoid(z3)  # 5000 x 10, this is h_theta(X)

    return a1, z2, a2, z3, h  # return every layer's intermediate values

# _, _, _, _, h = feed_forward(theta, X)

# print(h.shape)  # (5000, 10)

Cost function and regularization

The unregularized cost:

$$J(\theta) = \frac{1}{m}\sum_{i=1}^{m}\sum_{k=1}^{K}\Bigl[-y_k^{(i)}\log\bigl(h_\theta(x^{(i)})_k\bigr) - (1 - y_k^{(i)})\log\bigl(1 - h_\theta(x^{(i)})_k\bigr)\Bigr]$$

The regularized cost adds a penalty over all non-bias weights:

$$J(\theta) = \frac{1}{m}\sum_{i=1}^{m}\sum_{k=1}^{K}\bigl[\cdots\bigr] + \frac{\lambda}{2m}\Bigl[\sum_{j,k}\bigl(\Theta^{(1)}_{j,k}\bigr)^2 + \sum_{j,k}\bigl(\Theta^{(2)}_{j,k}\bigr)^2\Bigr]$$

def cost(theta, X, y):
    m = X.shape[0]
    _, _, _, _, h = feed_forward(theta, X)
    # element-wise cross-entropy over all samples and classes
    pair_computation = -np.multiply(y, np.log(h)) - np.multiply((1 - y), np.log(1 - h))
    return pair_computation.sum() / m


cost_res = cost(theta, X, y)
print("cost:", cost_res)


def regularized_cost(theta, X, y, l=1):
    t1, t2 = deserialize(theta)  # t1: (25, 401), t2: (10, 26)
    m = X.shape[0]

    # skip the first column (bias weights) of both matrices
    reg_t1 = np.power(t1[:, 1:], 2).sum()
    reg_t2 = np.power(t2[:, 1:], 2).sum()
    reg = (l / (2 * m)) * (reg_t1 + reg_t2)  # note: the original code dropped the factor l here

    return cost(theta, X, y) + reg


regularized_cost_res = regularized_cost(theta, X, y)
print("reg cost:", regularized_cost_res)

Back propagation

def sigmoid_gradient(z):
    # g'(z) = g(z) * (1 - g(z))
    return np.multiply(sigmoid(z), 1 - sigmoid(z))


print(sigmoid_gradient(0))  # 0.25


def gradient(theta, X, y):
    t1, t2 = deserialize(theta)
    m = X.shape[0]
    delta1 = np.zeros(t1.shape)  # (25, 401)
    delta2 = np.zeros(t2.shape)  # (10, 26)

    a1, z2, a2, z3, h = feed_forward(theta, X)

    for i in range(m):
        a1i = a1[i, :]  # (401,)
        z2i = z2[i, :]  # (25,)
        a2i = a2[i, :]  # (26,)

        hi = h[i, :]  # (10,)
        yi = y[i, :]  # (10,)

        d3i = hi - yi  # output-layer error

        z2i = np.insert(z2i, 0, np.ones(1))  # add bias so shapes match t2.T
        d2i = np.multiply(t2.T @ d3i, sigmoid_gradient(z2i))  # (26,)

        delta2 += np.matrix(d3i).T @ np.matrix(a2i)  # (10, 26)
        delta1 += np.matrix(d2i[1:]).T @ np.matrix(a1i)  # (25, 401), bias error dropped

    delta1 = delta1 / m
    delta2 = delta2 / m

    return serialize(delta1, delta2)


d1, d2 = deserialize(gradient(theta, X, y))
print(d1.shape, d2.shape)  # (25, 401) (10, 26)
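The per-sample loop mirrors the course notation but is slow in Python. A vectorized version (a sketch, mathematically equivalent to the loop above) computes the same gradients with a few matrix products:

def gradient_vec(theta, X, y):
    t1, t2 = deserialize(theta)
    m = X.shape[0]
    a1, z2, a2, z3, h = feed_forward(theta, X)

    d3 = h - y                                              # (5000, 10)
    d2 = np.multiply(d3 @ t2[:, 1:], sigmoid_gradient(z2))  # (5000, 25), bias column dropped
    delta2 = d3.T @ a2 / m                                  # (10, 26)
    delta1 = d2.T @ a1 / m                                  # (25, 401)
    return serialize(delta1, delta2)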

Gradient checking

$$\frac{\partial J}{\partial \theta_i} \approx \frac{J(\theta + \epsilon\, e_i) - J(\theta - \epsilon\, e_i)}{2\epsilon}$$

Regularized gradient:

$$D^{(l)}_{ij} = \frac{1}{m}\Delta^{(l)}_{ij} + \frac{\lambda}{m}\Theta^{(l)}_{ij} \quad (j \ge 1),\qquad D^{(l)}_{ij} = \frac{1}{m}\Delta^{(l)}_{ij} \quad (j = 0)$$

def regularized_gradient(theta, X, y, l=1):
    """Don't regularize theta of bias terms."""
    m = X.shape[0]
    delta1, delta2 = deserialize(gradient(theta, X, y))
    t1, t2 = deserialize(theta)

    t1[:, 0] = 0  # zero the bias column so it receives no penalty
    reg_term_d1 = (l / m) * t1
    delta1 = delta1 + reg_term_d1

    t2[:, 0] = 0
    reg_term_d2 = (l / m) * t2
    delta2 = delta2 + reg_term_d2

    return serialize(delta1, delta2)


def expand_array(arr):
    """Replicate a vector into a square matrix, one copy per row:
    [1, 2, 3]

    [[1, 2, 3],
     [1, 2, 3],
     [1, 2, 3]]
    """
    # turn matrix back to ndarray
    return np.array(np.matrix(np.ones(arr.shape[0])).T @ np.matrix(arr))


def gradient_checking(theta, X, y, epsilon, regularized=False):
    def a_numeric_grad(plus, minus, regularized=False):
        """calculate a partial gradient with respect to 1 theta"""
        if regularized:
            return (regularized_cost(plus, X, y) - regularized_cost(minus, X, y)) / (epsilon * 2)
        else:
            return (cost(plus, X, y) - cost(minus, X, y)) / (epsilon * 2)

    theta_matrix = expand_array(theta)  # expand to (10285, 10285)
    epsilon_matrix = np.identity(len(theta)) * epsilon

    plus_matrix = theta_matrix + epsilon_matrix
    minus_matrix = theta_matrix - epsilon_matrix

    # calculate the numerical gradient with respect to every theta, one at a time
    numeric_grad = np.array([a_numeric_grad(plus_matrix[i], minus_matrix[i], regularized)
                             for i in range(len(theta))])

    # the analytical gradient depends on whether you want it regularized or not
    analytic_grad = regularized_gradient(theta, X, y) if regularized else gradient(theta, X, y)

    # With a correct implementation and EPSILON = 0.0001,
    # the diff below should be on the order of 1e-9.
    # This is how the original MATLAB code does gradient checking.
    diff = np.linalg.norm(numeric_grad - analytic_grad) / np.linalg.norm(numeric_grad + analytic_grad)

    print('If your backpropagation implementation is correct,\n'
          'the relative difference will be on the order of 1e-9 or smaller (assuming epsilon=0.0001).\n'
          'Relative Difference: {}\n'.format(diff))


# gradient_checking(theta, X, y, epsilon=0.0001)  # very slow; run with caution

If your backpropagation implementation is correct,
the relative difference will be on the order of 1e-9 or smaller (assuming epsilon=0.0001).
Relative Difference: 2.1466000818218673e-09

Training the model

def random_init(size):
    # break symmetry: initialize uniformly in [-0.12, 0.12]
    return np.random.uniform(-0.12, 0.12, size)


def nn_training(X, y):
    init_theta = random_init(10285)  # 25 * 401 + 10 * 26

    res = opt.minimize(fun=regularized_cost,
                       x0=init_theta,
                       args=(X, y, 1),
                       method='TNC',
                       jac=regularized_gradient,
                       options={'maxiter': 400})
    return res


res = nn_training(X, y)  # slow
print(res)
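The ±0.12 range in random_init is the one the exercise recommends: choosing epsilon_init = sqrt(6) / sqrt(L_in + L_out) for the 400-unit input and 25-unit hidden layer gives roughly 0.12, large enough to break symmetry but small enough to keep the sigmoids away from their flat regions.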

Output:

     fun: 0.32211992072588747
     jac: array([ 2.15004329e-04,  3.88985627e-08, -3.33174201e-08, ...,
                  3.15328424e-05,  2.82831419e-05, -1.68082404e-05])
 message: 'Max. number of function evaluations reached'
    nfev: 400
     nit: 26
  status: 3
 success: False
       x: array([ 0.00000000e+00,  1.94492814e-04, -1.66587101e-04, ...,
                 -7.15493763e-01, -1.36561388e+00, -2.90127262e+00])
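Note that success: False here only means the optimizer hit its function-evaluation cap; the cost has already dropped to about 0.32, which is good enough for prediction, as the accuracy report below confirms.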

Showing the accuracy

_, y_answer = load_data('./data/ex4data1.mat')  # original labels 1..10

final_theta = res.x


def show_accuracy(theta, X, y):
    _, _, _, _, h = feed_forward(theta, X)

    # argmax over the 10 output units, +1 to map back to labels 1..10
    y_pred = np.argmax(h, axis=1) + 1

    print(classification_report(y, y_pred))


show_accuracy(final_theta, X, y_answer)

Output:

             precision    recall  f1-score   support

          1       1.00      0.79      0.88       500
          2       0.73      1.00      0.85       500
          3       0.82      0.99      0.89       500
          4       1.00      0.89      0.94       500
          5       1.00      0.86      0.92       500
          6       0.94      0.99      0.97       500
          7       0.99      0.81      0.89       500
          8       0.94      0.95      0.95       500
          9       0.96      0.95      0.95       500
         10       0.96      0.98      0.97       500

avg / total       0.93      0.92      0.92      5000

Visualizing the hidden layer
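Each of the 25 hidden units corresponds to one row of Theta1; dropping the bias weight and reshaping the remaining 400 weights to 20 x 20 shows the input pattern that unit responds to most strongly.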

def plot_hidden_layer(theta):
    """
    theta: (10285, )
    """
    final_theta1, _ = deserialize(theta)
    hidden_layer = final_theta1[:, 1:]  # get rid of the bias term theta

    fig, ax_array = plt.subplots(nrows=5, ncols=5, sharey=True, sharex=True, figsize=(5, 5))

    for r in range(5):
        for c in range(5):
            ax_array[r, c].matshow(hidden_layer[5 * r + c].reshape((20, 20)),
                                   cmap=matplotlib.cm.binary)
            plt.xticks(np.array([]))
            plt.yticks(np.array([]))


plot_hidden_layer(final_theta)
plt.show()

(figure: the 25 hidden-unit weight images)
